# Source code for nlp_architect.data.cdc_resources.wikipedia.wiki_offline

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import logging
from os import listdir
from os.path import join, isfile

from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import \
    WikipediaPageExtractedRelations
from nlp_architect.data.cdc_resources.wikipedia.wiki_search_page_result import \
    WikipediaSearchPageResult
from nlp_architect.utils.io import load_json_file

logger = logging.getLogger(__name__)


class WikiOffline(object):
    """Phrase-to-pages lookup over a directory of pre-extracted JSON dump files.

    After construction, ``self.dump`` is a dict whose values are sets of
    ``WikipediaSearchPageResult`` objects built by ``extract_json_values``.
    """

    def __init__(self, wikidump):
        """Load the dump found under ``wikidump``.

        Args:
            wikidump: path of the directory holding the JSON dump files. A
                falsy value (``None``, ``''``) skips loading.
        """
        # Always define ``dump`` so get_pages() keeps working (and returns an
        # empty set) when no dump location was supplied, instead of failing
        # with AttributeError.
        self.dump = {}
        if wikidump:
            self.dump = self.load_dump(wikidump)
            logger.info('Wikipedia dump loaded successfully!')

    def get_pages(self, phrase):
        """Return the pages stored for ``phrase``.

        Args:
            phrase: key to look up in the loaded dump.

        Returns:
            The stored set of pages, or a new empty set when ``phrase`` is
            falsy, is not in the dump, or maps to an empty value.
        """
        if not phrase:
            return set()
        return self.dump.get(phrase) or set()

    @staticmethod
    def extract_json_values(json_pages):
        """Convert JSON page records into ``WikipediaSearchPageResult`` objects.

        Args:
            json_pages: iterable of dict-like page records (anything exposing
                ``.get``). Keys read from each record: ``description``,
                ``pageid``, ``orig_phrase``, ``orig_phrase_norm``,
                ``wiki_title``, ``wiki_title_norm`` and ``relations``.

        Returns:
            set: one ``WikipediaSearchPageResult`` per input record.
        """
        pages = set()
        for json_page in json_pages:
            description = json_page.get('description', None)
            pageid = int(json_page.get('pageid', 0))
            orig_phrase = json_page.get('orig_phrase', None)
            orig_phrase_norm = json_page.get('orig_phrase_norm', None)
            wiki_title = json_page.get('wiki_title', None)
            wiki_title_norm = json_page.get('wiki_title_norm', None)

            # A record without a 'relations' entry is treated as having no
            # relation fields at all, rather than raising AttributeError on
            # ``None.get``.
            relations_json = json_page.get('relations', None) or {}

            # Positional order must match the constructor call used by the
            # original code: flags, raw relation values, then normalized ones.
            relations = WikipediaPageExtractedRelations(
                relations_json.get('isPartName', None),
                relations_json.get('isDisambiguation', None),
                relations_json.get('titleParenthesis', None),
                relations_json.get('disambiguationLinks', None),
                relations_json.get('categories', None),
                relations_json.get('aliases', None),
                relations_json.get('beCompRelations', None),
                relations_json.get('disambiguationLinksNorm', None),
                relations_json.get('categoriesNorm', None),
                relations_json.get('aliasesNorm', None),
                relations_json.get('titleParenthesisNorm', None),
                relations_json.get('beCompRelationsNorm', None))

            page = WikipediaPage(orig_phrase, orig_phrase_norm, wiki_title,
                                 wiki_title_norm, 0, pageid, description, relations)
            pages.add(WikipediaSearchPageResult(orig_phrase, page))

        return pages

    def load_dump(self, wiki_dump):
        """Read every regular file directly under ``wiki_dump`` and index it.

        Each file is loaded with ``load_json_file`` and merged into a single
        dict with ``dict.update``, so when a key appears in several files the
        file processed last wins (processing order is the ``listdir`` order).

        Args:
            wiki_dump: directory path containing the JSON dump files.

        Returns:
            dict: every merged key mapped to the set produced by
            ``extract_json_values`` for its value.
        """
        # Sub-directories and other non-regular entries are skipped.
        only_files = []
        for entry in listdir(wiki_dump):
            file_path = join(wiki_dump, entry)
            if isfile(file_path):
                only_files.append(file_path)

        merged = {}
        for file_path in only_files:
            merged.update(load_json_file(file_path))

        return {key: self.extract_json_values(value) for key, value in merged.items()}
class NoPage(object):
    """ Attribute not found. """

    # NOTE(review): this looks like an auto-generated skeleton (the signatures
    # are marked "unknown") -- confirm before relying on it.

    def __init__(self, *args, **kwargs):  # real signature unknown
        """Accept any arguments and do nothing."""
        return None

    @staticmethod  # known case of __new__
    def __new__(S, *more):  # real signature unknown; restored from __doc__
        """ T.__new__(S, ...) -> a new object with type S, a subtype of T """
        # The stub has no implementation, so calling the class yields None
        # rather than an instance.
        return None